Purpose

Look at the sampling effort for the various WQ parameters from discretewq to be included in the long-term WQ publication.

Global code and functions

# Load packages
library(tidyverse)
library(lubridate)
library(hms)
library(scales)
library(discretewq)
library(deltamapr)
library(sf)
library(leaflet)
library(dtplyr)
library(here)
# Check if we are in the correct working directory
i_am("Sampling_Effort_discretewq.Rmd")
## here() starts at C:/Repositories/04_IEP_Org/WQ-LT-Publication
# Run session info to display package versions
devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.2.1 (2022-06-23 ucrt)
##  os       Windows 10 x64 (build 19044)
##  system   x86_64, mingw32
##  ui       RTerm
##  language (EN)
##  collate  English_United States.utf8
##  ctype    English_United States.utf8
##  tz       America/Los_Angeles
##  date     2022-08-23
##  pandoc   2.18 @ C:/Program Files/RStudio/bin/quarto/bin/tools/ (via rmarkdown)
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package       * version date (UTC) lib source
##  assertthat      0.2.1   2019-03-21 [1] CRAN (R 4.2.1)
##  backports       1.4.1   2021-12-13 [1] CRAN (R 4.2.0)
##  broom           1.0.0   2022-07-01 [1] CRAN (R 4.2.1)
##  bslib           0.4.0   2022-07-16 [1] CRAN (R 4.2.1)
##  cachem          1.0.6   2021-08-19 [1] CRAN (R 4.2.1)
##  callr           3.7.1   2022-07-13 [1] CRAN (R 4.2.1)
##  cellranger      1.1.0   2016-07-27 [1] CRAN (R 4.2.1)
##  class           7.3-20  2022-01-16 [2] CRAN (R 4.2.1)
##  classInt        0.4-7   2022-06-10 [1] CRAN (R 4.2.1)
##  cli             3.3.0   2022-04-25 [1] CRAN (R 4.2.1)
##  colorspace      2.0-3   2022-02-21 [1] CRAN (R 4.2.1)
##  crayon          1.5.1   2022-03-26 [1] CRAN (R 4.2.1)
##  crosstalk       1.2.0   2021-11-04 [1] CRAN (R 4.2.1)
##  data.table      1.14.2  2021-09-27 [1] CRAN (R 4.2.1)
##  DBI             1.1.3   2022-06-18 [1] CRAN (R 4.2.1)
##  dbplyr          2.2.1   2022-06-27 [1] CRAN (R 4.2.1)
##  deltamapr     * 1.0.0   2021-06-18 [1] Github (InteragencyEcologicalProgram/deltamapr@d0a6f9c)
##  devtools        2.4.4   2022-07-20 [1] CRAN (R 4.2.1)
##  digest          0.6.29  2021-12-01 [1] CRAN (R 4.2.1)
##  discretewq    * 2.3.2   2022-08-18 [1] local
##  dplyr         * 1.0.9   2022-04-28 [1] CRAN (R 4.2.1)
##  dtplyr        * 1.2.1   2022-01-19 [1] CRAN (R 4.2.1)
##  e1071           1.7-11  2022-06-07 [1] CRAN (R 4.2.1)
##  ellipsis        0.3.2   2021-04-29 [1] CRAN (R 4.2.1)
##  evaluate        0.15    2022-02-18 [1] CRAN (R 4.2.1)
##  fansi           1.0.3   2022-03-24 [1] CRAN (R 4.2.1)
##  fastmap         1.1.0   2021-01-25 [1] CRAN (R 4.2.1)
##  forcats       * 0.5.1   2021-01-27 [1] CRAN (R 4.2.1)
##  fs              1.5.2   2021-12-08 [1] CRAN (R 4.2.1)
##  gargle          1.2.0   2021-07-02 [1] CRAN (R 4.2.1)
##  generics        0.1.3   2022-07-05 [1] CRAN (R 4.2.1)
##  ggplot2       * 3.3.6   2022-05-03 [1] CRAN (R 4.2.1)
##  glue            1.6.2   2022-02-24 [1] CRAN (R 4.2.1)
##  googledrive     2.0.0   2021-07-08 [1] CRAN (R 4.2.1)
##  googlesheets4   1.0.0   2021-07-21 [1] CRAN (R 4.2.1)
##  gtable          0.3.0   2019-03-25 [1] CRAN (R 4.2.1)
##  haven           2.5.0   2022-04-15 [1] CRAN (R 4.2.1)
##  here          * 1.0.1   2020-12-13 [1] CRAN (R 4.2.1)
##  hms           * 1.1.1   2021-09-26 [1] CRAN (R 4.2.1)
##  htmltools       0.5.3   2022-07-18 [1] CRAN (R 4.2.1)
##  htmlwidgets     1.5.4   2021-09-08 [1] CRAN (R 4.2.1)
##  httpuv          1.6.5   2022-01-05 [1] CRAN (R 4.2.1)
##  httr            1.4.3   2022-05-04 [1] CRAN (R 4.2.1)
##  jquerylib       0.1.4   2021-04-26 [1] CRAN (R 4.2.1)
##  jsonlite        1.8.0   2022-02-22 [1] CRAN (R 4.2.1)
##  KernSmooth      2.23-20 2021-05-03 [2] CRAN (R 4.2.1)
##  knitr           1.39    2022-04-26 [1] CRAN (R 4.2.1)
##  later           1.3.0   2021-08-18 [1] CRAN (R 4.2.1)
##  leaflet       * 2.1.1   2022-03-23 [1] CRAN (R 4.2.1)
##  lifecycle       1.0.1   2021-09-24 [1] CRAN (R 4.2.1)
##  lubridate     * 1.8.0   2021-10-07 [1] CRAN (R 4.2.1)
##  magrittr        2.0.3   2022-03-30 [1] CRAN (R 4.2.1)
##  memoise         2.0.1   2021-11-26 [1] CRAN (R 4.2.1)
##  mime            0.12    2021-09-28 [1] CRAN (R 4.2.0)
##  miniUI          0.1.1.1 2018-05-18 [1] CRAN (R 4.2.1)
##  modelr          0.1.8   2020-05-19 [1] CRAN (R 4.2.1)
##  munsell         0.5.0   2018-06-12 [1] CRAN (R 4.2.1)
##  pillar          1.8.0   2022-07-18 [1] CRAN (R 4.2.1)
##  pkgbuild        1.3.1   2021-12-20 [1] CRAN (R 4.2.1)
##  pkgconfig       2.0.3   2019-09-22 [1] CRAN (R 4.2.1)
##  pkgload         1.3.0   2022-06-27 [1] CRAN (R 4.2.1)
##  prettyunits     1.1.1   2020-01-24 [1] CRAN (R 4.2.1)
##  processx        3.7.0   2022-07-07 [1] CRAN (R 4.2.1)
##  profvis         0.3.7   2020-11-02 [1] CRAN (R 4.2.1)
##  promises        1.2.0.1 2021-02-11 [1] CRAN (R 4.2.1)
##  proxy           0.4-27  2022-06-09 [1] CRAN (R 4.2.1)
##  ps              1.7.1   2022-06-18 [1] CRAN (R 4.2.1)
##  purrr         * 0.3.4   2020-04-17 [1] CRAN (R 4.2.1)
##  R6              2.5.1   2021-08-19 [1] CRAN (R 4.2.1)
##  Rcpp            1.0.9   2022-07-08 [1] CRAN (R 4.2.1)
##  readr         * 2.1.2   2022-01-30 [1] CRAN (R 4.2.1)
##  readxl          1.4.0   2022-03-28 [1] CRAN (R 4.2.1)
##  remotes         2.4.2   2021-11-30 [1] CRAN (R 4.2.1)
##  reprex          2.0.1   2021-08-05 [1] CRAN (R 4.2.1)
##  rlang           1.0.4   2022-07-12 [1] CRAN (R 4.2.1)
##  rmarkdown       2.14    2022-04-25 [1] CRAN (R 4.2.1)
##  rprojroot       2.0.3   2022-04-02 [1] CRAN (R 4.2.1)
##  rstudioapi      0.13    2020-11-12 [1] CRAN (R 4.2.1)
##  rvest           1.0.2   2021-10-16 [1] CRAN (R 4.2.1)
##  sass            0.4.2   2022-07-16 [1] CRAN (R 4.2.1)
##  scales        * 1.2.0   2022-04-13 [1] CRAN (R 4.2.1)
##  sessioninfo     1.2.2   2021-12-06 [1] CRAN (R 4.2.1)
##  sf            * 1.0-8   2022-07-14 [1] CRAN (R 4.2.1)
##  shiny           1.7.2   2022-07-19 [1] CRAN (R 4.2.1)
##  stringi         1.7.8   2022-07-11 [1] CRAN (R 4.2.1)
##  stringr       * 1.4.0   2019-02-10 [1] CRAN (R 4.2.1)
##  tibble        * 3.1.8   2022-07-22 [1] CRAN (R 4.2.1)
##  tidyr         * 1.2.0   2022-02-01 [1] CRAN (R 4.2.1)
##  tidyselect      1.1.2   2022-02-21 [1] CRAN (R 4.2.1)
##  tidyverse     * 1.3.2   2022-07-18 [1] CRAN (R 4.2.1)
##  tzdb            0.3.0   2022-03-28 [1] CRAN (R 4.2.1)
##  units           0.8-0   2022-02-05 [1] CRAN (R 4.2.1)
##  urlchecker      1.0.1   2021-11-30 [1] CRAN (R 4.2.1)
##  usethis         2.1.6   2022-05-25 [1] CRAN (R 4.2.1)
##  utf8            1.2.2   2021-07-24 [1] CRAN (R 4.2.1)
##  vctrs           0.4.1   2022-04-13 [1] CRAN (R 4.2.1)
##  withr           2.5.0   2022-03-03 [1] CRAN (R 4.2.1)
##  xfun            0.31    2022-05-10 [1] CRAN (R 4.2.1)
##  xml2            1.3.3   2021-11-30 [1] CRAN (R 4.2.1)
##  xtable          1.8-4   2019-04-21 [1] CRAN (R 4.2.1)
##  yaml            2.3.5   2022-02-21 [1] CRAN (R 4.2.1)
## 
##  [1] C:/R/win-library/4.2
##  [2] C:/Program Files/R/R-4.2.1/library
## 
## ──────────────────────────────────────────────────────────────────────────────

Import and Prepare Data

# Pull in and prepare the WQ data from discretewq
df_dwq <-
  wq(
    Sources = c(
      "EMP",
      "STN",
      "FMWT",
      "EDSM",
      "DJFMP",
      "SDO",
      "SKT",
      "SLS",
      "20mm",
      "Suisun",
      "Baystudy",
      "USBR",
      "USGS_SFBS",
      "YBFMP",
      "USGS_CAWSC"
    )
  ) %>% 
  transmute(
    Source,
    Station,
    Latitude,
    Longitude,
    Date = date(Date),
    # Convert Datetime to PST
    Datetime = with_tz(Datetime, tzone = "Etc/GMT+8"),
    Temperature,
    Salinity,
    Secchi,
    Chlorophyll,
    DissAmmonia,
    DissNitrateNitrite,
    DissOrthophos
  ) %>% 
  filter(
    !if_all(
      c(Temperature, Salinity, Secchi, Chlorophyll, DissAmmonia, DissNitrateNitrite, DissOrthophos),
      is.na
    )
  )

Temporal Coverage

Now that we’ve been making updates to the WQ data in the discretewq package, let’s take a look at which surveys we can use for the long-term WQ publication. First, we’ll look at the temporal scale of all of the surveys available.

df_dwq %>% 
  group_by(Source) %>% 
  summarize(min_date = min(Date), max_date = max(Date)) %>% 
  arrange(min_date)
## # A tibble: 15 × 3
##    Source     min_date   max_date  
##    <chr>      <date>     <date>    
##  1 FMWT       1967-09-12 2021-12-16
##  2 USGS_SFBS  1969-04-10 2022-05-09
##  3 STN        1969-07-05 2021-08-19
##  4 USGS_CAWSC 1971-03-02 2021-03-10
##  5 EMP        1975-01-07 2021-12-16
##  6 DJFMP      1976-05-13 2021-12-29
##  7 Suisun     1979-05-16 2021-08-25
##  8 Baystudy   1980-01-23 2020-12-01
##  9 20mm       1995-04-24 2021-07-16
## 10 SDO        1997-08-04 2018-11-08
## 11 YBFMP      1998-01-19 2021-12-30
## 12 SKT        2002-01-07 2021-04-29
## 13 SLS        2009-01-05 2021-03-17
## 14 USBR       2012-05-08 2019-10-22
## 15 EDSM       2016-12-15 2021-11-26

Overall, for all parameters, it looks like FMWT, USGS_SFBS, STN, USGS_CAWSC, EMP, DJFMP, Suisun, and Baystudy have adequate temporal coverage for the long-term analysis. Now let’s take a look at the spatial coverage of these surveys.

Spatial Coverage

# Only include surveys with adequate temporal coverage
df_dwq_lt <- df_dwq %>% 
  filter(Source %in% c("FMWT", "USGS_SFBS", "STN", "USGS_CAWSC", "EMP", "DJFMP", "Suisun", "Baystudy"))

# Pull out station coordinates and convert to sf object
sf_dwq_lt <- df_dwq_lt %>% 
  distinct(Source, Station, Latitude, Longitude) %>% 
  drop_na(Latitude, Longitude) %>% 
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326) %>% 
  st_transform(st_crs(R_EDSM_Subregions_Mahardja_FLOAT))

# Load Delta Shapefile from Brian and only keep SubRegions east of Carquinez Straight
sf_delta <- R_EDSM_Subregions_Mahardja_FLOAT %>% 
  filter(
    !SubRegion %in% c(
      "Carquinez Strait", 
      "Lower Napa River", 
      "San Francisco Bay",
      "San Pablo Bay",
      "South Bay",
      "Upper Napa River" 
    )
  ) %>% 
  select(SubRegion)

# Plot all stations over the SubRegions
ggplot() +
  geom_sf(data = sf_delta, fill = "green", alpha = 0.5) +
  geom_sf(data = sf_dwq_lt) +
  theme_bw()

# Assign SubRegions to the stations
sf_dwq_lt_reg <- sf_dwq_lt %>% 
  st_join(sf_delta, join = st_intersects) %>%
  filter(!is.na(SubRegion))

# Remove SubRegions without stations from sf_delta
sf_delta_c <- sf_delta %>% filter(SubRegion %in% unique(sf_dwq_lt_reg$SubRegion))

# Plot all stations over the SubRegions again - with limited ranges
ggplot() +
  geom_sf(data = sf_delta_c, fill = "blue", alpha = 0.5) +
  geom_sf(data = sf_dwq_lt_reg) +
  theme_bw()

All but one of the SubRegions east of Carquinez Straight has at least one station from a long-term survey within it. Now let’s take a closer look at the temporal data coverage for each Station and parameter.

Temporal Coverage by Station

# Prepare long-term data from discretewq
df_dwq_lt_c <- df_dwq_lt %>% 
  # Remove records without latitude-longitude coordinates
  drop_na(Latitude, Longitude) %>%
  # Convert to sf object
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326, remove = FALSE) %>%
  # Change to crs of sf_delta_c
  st_transform(crs = st_crs(sf_delta_c)) %>%
  # Add subregions
  st_join(sf_delta_c, join = st_intersects) %>%
  # Remove any data outside our subregions of interest
  filter(!is.na(SubRegion)) %>%
  # Drop sf geometry column since it's no longer needed
  st_drop_geometry() %>% 
  # Add variables for adjusted calendar year, month, and season
    # Adjusted calendar year: December-November, with December of the previous calendar year
    # included with the following year
  mutate(
    Month = month(Date),
    YearAdj = if_else(Month == 12, year(Date) + 1, year(Date)),
    Season = case_when(
      Month %in% 3:5 ~ "Spring",
      Month %in% 6:8 ~ "Summer",
      Month %in% 9:11 ~ "Fall",
      Month %in% c(12, 1, 2) ~ "Winter"
    )
  ) %>% 
  # Restrict data to 1975-2021
  filter(YearAdj %in% 1975:2021)

# Prepare data separately for each parameter
prep_samp_effort <- function(df, param) {
  # Remove any NA values in parameter of interest
  df_param <- df %>% drop_na(.data[[param]])
  
  # Look for any instances when more than 1 data point was collected at a station-day
  df_dups <- df_param %>%
    count(Source, Station, Date) %>% 
    filter(n > 1) %>% 
    select(-n)
  
  # Fix duplicates
  df_dups_fixed <- df_param %>%
    inner_join(df_dups, by = c("Source", "Station", "Date")) %>%
    drop_na(Datetime) %>%
    mutate(
      # Create variable for time
      Time = as_hms(Datetime),
      # Calculate difference from noon for each data point for later filtering
      Noon_diff = abs(hms(hours = 12) - Time)
    ) %>%
    # Use dtplyr to speed up operations
    lazy_dt() %>%
    group_by(Station, Date) %>%
    # Select only 1 data point per station and date, choose data closest to noon
    filter(Noon_diff == min(Noon_diff)) %>%
    # When points are equidistant from noon, select earlier point
    filter(Time == min(Time)) %>%
    ungroup() %>%
    # End dtplyr operation
    as_tibble() %>%
    select(-c(Time, Noon_diff))

  # Add back fixed duplicates and format data frame
  df_param %>%
    anti_join(df_dups, by = c("Source", "Station", "Date")) %>%
    bind_rows(df_dups_fixed) %>% 
    select(
      Source,
      Station,
      Latitude,
      Longitude,
      SubRegion,
      YearAdj,
      Month,
      Season,
      Date,
      Datetime,
      .data[[param]]
    )
}

# Create a nested data frame to run functions on
ndf_dwq_lt <- 
  tibble(
    Parameter = c(
      "Temperature",
      "Salinity",
      "Secchi",
      "DissAmmonia",
      "DissNitrateNitrite",
      "DissOrthophos",
      "Chlorophyll"
    ),
    df_data = rep(list(df_dwq_lt_c), 7)
  ) %>% 
  # Prepare data for each Parameter
  mutate(df_data = map2(df_data, Parameter, .f = prep_samp_effort))

Plots - Sampling Effort by Station

# Plot function
plot_samp_effort_sta <- function(df) {
  df %>%
    count(Station, YearAdj, name = "num_samples") %>%
    mutate(Station = fct_rev(factor(Station))) %>%
    ggplot(aes(x = YearAdj, y = Station, fill = num_samples)) +
    geom_tile() +
    scale_x_continuous(
      limits = c(1974, 2022),
      breaks = breaks_pretty(20), 
      expand = expansion()
    ) +
    scale_fill_viridis_c(name = "Number of Samples") +
    theme_bw() +
    theme(
      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
      legend.position = "top"
    )
}

# Create sampling effort by station plots for each Parameter and Source
ndf_dwq_lt_se_sta_plt <- ndf_dwq_lt %>% 
  mutate(
    df_data = map(df_data, ~ nest(.x, df_data2 = -Source)),
    df_data = modify_depth(df_data, 1, ~ mutate(.x, plt = map(df_data2, plot_samp_effort_sta)))
  )

Temperature

FMWT

Baystudy

STN

Suisun

DJFMP

EMP

USGS_SFBS

Salinity

FMWT

Baystudy

STN

Suisun

DJFMP

EMP

USGS_SFBS

Secchi

FMWT

Baystudy

STN

Suisun

DJFMP

EMP

DissAmmonia

EMP

USGS_SFBS

USGS_CAWSC

DissNitrateNitrite

EMP

USGS_SFBS

USGS_CAWSC

DissOrthophos

EMP

USGS_SFBS

USGS_CAWSC

Chlorophyll

EMP

USGS_SFBS

USGS_CAWSC

Restrict Stations and SubRegions

Not all of the stations from the long-term surveys were sampled consistently from 1975-2021. We need to make sure that we are only using SubRegions that have adequate sample coverage in our analyses. We’ll start with removing the obvious survey-parameter combinations that were only sparsely sampled.

Remove Sparse Surveys

DJFMP only sampled salinity for the past three years, so we won’t include this survey in the salinity analysis. For the USGS-CAWSC stations, only station 11447650 (Sacramento River at Freeport) was sampled on a long-term basis for Dissolved Ammonia, Nitrate+Nitrite, and Orthophosphate. Also, Chlorophyll wasn’t sampled consistently at any of the USGS-CAWSC stations. We’ll exclude this data from our data set before continuing.

ndf_dwq_lt <- ndf_dwq_lt %>% 
  mutate(
    df_data_filt = case_when(
      Parameter == "Salinity" ~ modify_depth(df_data, 1, ~ filter(.x, Source != "DJFMP")),
      str_detect(Parameter, "^Diss") ~ modify_depth(
        df_data, 1, ~ filter(.x, !(Source == "USGS_CAWSC" & Station != "USGS-11447650"))),
      Parameter == "Chlorophyll" ~ modify_depth(df_data, 1, ~ filter(.x, Source != "USGS_CAWSC")),
      TRUE ~ df_data
    )
  )

Suisun Marsh Survey

Some of the stations from the Suisun Marsh survey are located in small backwater channels and dead-end sloughs which represent a much different habitat than the sampling locations from the other surveys which tend to be in larger, open water channel habitat. We’ll take a closer look at the Suisun Marsh survey, and only include stations that are in the larger channels to be consistent with the other surveys.

# Pull out stations that are within the SubRegions containing the Suisun Marsh survey - 
  # Water temperature has a complete list of all stations collected by the Suisun
  # Marsh survey, so we'll go with that
suisun_subreg <- ndf_dwq_lt$df_data_filt[[1]] %>% 
  filter(Source == "Suisun") %>% 
  distinct(SubRegion) %>% 
  pull(SubRegion)
  
sf_suisun_sta <- ndf_dwq_lt$df_data_filt[[1]] %>% 
  filter(SubRegion %in% suisun_subreg) %>% 
  distinct(Source, Station, Latitude, Longitude) %>% 
  # Convert to sf object
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326)

# Define color palette for Surveys 
color_pal_survey <- colorFactor(palette = "viridis", domain = sf_suisun_sta$Source)

# Create map using leaflet
leaflet() %>% 
  addTiles() %>%
  addCircleMarkers(
    data = sf_suisun_sta,
    radius = 5,
    fillColor = ~color_pal_survey(Source),
    fillOpacity = 0.8,
    weight = 0.5,
    color = "black",
    opacity = 1,
    label = paste0("Survey: ", sf_suisun_sta$Source, ", Station: ", sf_suisun_sta$Station)
  ) %>% 
  addLegend(
    position = "topright",
    pal = color_pal_survey,
    values = sf_suisun_sta$Source,
    title = "Survey:"
  )

We’ll keep the stations located in Suisun and Montezuma Sloughs from the Suisun Marsh survey, since they seem to be in the larger channels in the area.

ndf_dwq_lt2 <- ndf_dwq_lt %>% 
  mutate(
    df_data_filt = map(
      df_data_filt, 
      ~ filter(.x, !(Source == "Suisun" & !str_detect(Station, "^SU|^MZ")))
    )
  )

Sampling Effort by SubRegion

Before we start removing SubRegions and stations, let’s take a look at the sampling effort for each SubRegion and parameter combination.

# Plot function
plot_samp_effort_subreg <- function(df) {
  # Create a vector of all possible SubRegions
  subreg_all <- sf_delta %>% distinct(SubRegion) %>% pull(SubRegion)
  
  # Create plot
  df %>%
    count(SubRegion, YearAdj, name = "num_samples") %>%
    mutate(SubRegion = fct_rev(factor(SubRegion, levels = subreg_all))) %>% 
    ggplot(aes(x = YearAdj, y = SubRegion, fill = num_samples)) +
    geom_tile() +
    scale_x_continuous(
      limits = c(1974, 2022),
      breaks = breaks_pretty(20), 
      expand = expansion()
    ) +
    scale_y_discrete(drop = FALSE) +
    scale_fill_viridis_c(name = "Number of Samples") +
    theme_bw() +
    theme(
      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
      legend.position = "top"
    )
}

# Create sampling effort by SubRegion plots for each Parameter
ndf_dwq_lt_se_subreg_plt <- ndf_dwq_lt2 %>% mutate(plt = map(df_data_filt, plot_samp_effort_subreg))

Temperature

Salinity

Secchi

DissAmmonia

DissNitrateNitrite

DissOrthophos

Chlorophyll

Filter Stations

Since not all of the SubRegions were sampled consistently from 1975-2021, we’ll only keep the SubRegions that contain at least one station that was sampled a majority of the 47 years between 1975 to 2021. To determine an adequate threshold for number of years sampled, we’ll take a look at histograms of the number of years sampled across all stations for each parameter binned by survey.

Plots - Number of Years Sampled

ndf_dwq_lt_yr_hist <- ndf_dwq_lt2 %>% 
  mutate(
    # Count the number of years sampled for each Station and Parameter
    df_count = map(
      df_data_filt, 
      ~ distinct(.x, Source, Station, YearAdj) %>% 
        count(Source, Station, name = "num_years")
    ),
    # Create histograms of number of years sampled for each Parameter
    plt_count = map(
      df_count,
      ~ ggplot(.x, aes(num_years)) + 
        geom_histogram(bins = 30) + 
        facet_wrap(vars(Source), scales = "free_y")
    )
  )

Temperature

Salinity

Secchi

DissAmmonia

DissNitrateNitrite

DissOrthophos

Chlorophyll

Define “Core” Stations

After looking at the histograms, we’ll define the long-term or “Core” stations as those that were sampled at least 80% of years between 1975 to 2021, which is 38 years.

ndf_dwq_lt2 <- ndf_dwq_lt2 %>% 
  mutate(
    df_stations = map(
      df_data_filt,
      ~ distinct(.x, Source, Station, YearAdj) %>% 
        group_by(Source, Station) %>% 
        mutate(
          NumYrs = n(),
          StationGrp = case_when(
            Source == "EMP" & str_detect(Station, "^C3|^MD10") ~ "Core",
            NumYrs >= 38 ~ "Core", 
            TRUE ~ "Secondary"
          )
        ) %>% 
        ungroup() %>% 
        distinct(Source, Station, NumYrs, StationGrp)
    )
  )

Stations Map - before filtering

Let’s look at the map of “Core” and “Secondary” stations for water temperature before removing any stations and SubRegions from the analysis. This map excludes EMP’s EZ stations for better visibility of the static stations.

sf_sta_wt_before <- ndf_dwq_lt2$df_data_filt[[1]] %>% 
  distinct(Source, Station, Latitude, Longitude, SubRegion) %>% 
  left_join(ndf_dwq_lt2$df_stations[[1]], by = c("Source", "Station")) %>% 
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326, remove = FALSE) %>% 
  filter(!(Source == "EMP" & str_detect(Station, "^EZ")))

# Convert CRS of Region polygon to WGS84 so that it works with the leaflet map
sf_delta_c_4326 <- st_transform(sf_delta_c, crs = 4326)

# Define color palette for Station Groups 
color_pal_sta_grp <- colorFactor(palette = "viridis", domain = sf_sta_wt_before$StationGrp)

# Create map using leaflet
leaflet() %>% 
  addTiles() %>%
  addCircleMarkers(
    data = sf_sta_wt_before,
    radius = 5,
    fillColor = ~color_pal_sta_grp(StationGrp),
    fillOpacity = 0.8,
    weight = 0.5,
    color = "black",
    opacity = 1,
    label = paste0("Survey: ", sf_sta_wt_before$Source, ", Station: ", sf_sta_wt_before$Station)
  ) %>% 
  addPolylines(data = sf_delta_c_4326) %>%
  addLegend(
    position = "topright",
    pal = color_pal_sta_grp,
    values = sf_sta_wt_before$StationGrp,
    title = "Station Group:"
  )

Stations Map - after filtering

Now, let’s remove the SubRegions that don’t contain at least one “Core” station. We’ll then remove the stations within these under-sampled SubRegions.

ndf_dwq_lt2 <- ndf_dwq_lt2 %>% 
  mutate(
    df_data_filt2 = map2(
      df_data_filt, df_stations, 
      ~ left_join(.x, .y, by = c("Source", "Station")) %>% 
        mutate(StationGrp = factor(StationGrp, levels = c("Secondary", "Core"), ordered = TRUE)) %>% 
        group_by(SubRegion) %>% 
        mutate(StationGrp_Max = max(StationGrp)) %>% 
        ungroup() %>% 
        filter(StationGrp_Max == "Core") %>% 
        mutate(StationGrp = as.character(StationGrp))
    )
  ) 

Let’s look at the map of “Core” and “Secondary” stations for water temperature after removing any stations and SubRegions from the analysis.

sf_sta_wt_after <- ndf_dwq_lt2$df_data_filt2[[1]] %>% 
  distinct(Source, Station, Latitude, Longitude, SubRegion, StationGrp) %>% 
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326, remove = FALSE) %>% 
  filter(!(Source == "EMP" & str_detect(Station, "^EZ")))

# Remove under-sampled SubRegions from the Region polygon
sf_delta_c_4326_filt <- sf_delta_c_4326 %>% filter(SubRegion %in% unique(sf_sta_wt_after$SubRegion))

# Create map using leaflet
leaflet() %>% 
  addTiles() %>%
  addCircleMarkers(
    data = sf_sta_wt_after,
    radius = 5,
    fillColor = ~color_pal_sta_grp(StationGrp),
    fillOpacity = 0.8,
    weight = 0.5,
    color = "black",
    opacity = 1,
    label = paste0("Survey: ", sf_sta_wt_after$Source, ", Station: ", sf_sta_wt_after$Station)
  ) %>% 
  addPolylines(data = sf_delta_c_4326_filt) %>%
  addLegend(
    position = "topright",
    pal = color_pal_sta_grp,
    values = sf_sta_wt_after$StationGrp,
    title = "Station Group:"
  )

Hmmm, now there is a gigantic hole missing in the North-Central Delta. Let’s see how this compares to what we did in the long-term analysis for the 2021 Drought Synthesis report.

SubRegion Comparison

# Import Chlorophyll data used in the long-term analysis for the 2021 Drought
  # Synthesis report and add lat-long coordinates
df_chla_ds_lt <- read_csv(here("chla_data_stats_LT2.csv")) %>% 
  mutate(Source = if_else(Source == "USGS-SFBRMP", "USGS_SFBS", Source)) %>% 
  left_join(
    ndf_dwq_lt2$df_data_filt[[1]] %>% distinct(Source, Station, Latitude, Longitude),
    by = c("Source", "Station")
  )

# Pull out SubRegions for each parameter after using the filtering steps above
ndf_subregion <- ndf_dwq_lt2 %>% 
  transmute(
    Parameter,
    df_data_filt2,
    df_subreg = map(df_data_filt2, ~ distinct(.x, SubRegion))
  ) %>% 
  add_row(
    Parameter = c(
      "AllSubregions", 
      "DrtSynthWQ",
      "DrtSynthNutr",
      "DrtSynthChla"
    ),
    df_data_filt2 = list(
      ndf_dwq_lt2$df_data_filt[[1]],
      DroughtData::raw_wq_1975_2021,
      DroughtData::raw_nutr_1975_2021,
      df_chla_ds_lt
    ),
    df_subreg = list(
      # Add all possible SubRegions from sf_delta
      sf_delta %>% distinct(SubRegion),
      # Add SubRegions used in the long-term analysis for the 2021 Drought Synthesis report
      DroughtData::raw_wq_1975_2021 %>% distinct(SubRegion),
      DroughtData::raw_nutr_1975_2021 %>% distinct(SubRegion),
      df_chla_ds_lt %>% distinct(SubRegion)
    ),
    .before = 1
  ) %>% 
  mutate(
    # Create sf objects for the SubRegions and Stations for each parameter
    sf_subreg = map(df_subreg, ~ sf_delta %>% filter(SubRegion %in% unique(.x$SubRegion))),
    sf_stations = map(
      df_data_filt2,
      ~ distinct(.x, Source, Station, Latitude, Longitude) %>% 
        st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326) %>% 
        st_transform(crs = st_crs(sf_delta))
    )
  )

# Create a list of character vectors to define the suffixes for the reducing full join
lst_suffix <- map(ndf_subregion$Parameter[-1], ~ c("", paste0("_", .x)))

# Create a custom function for the full join
join_subregions <- function(df1, df2, suff) {
  full_join(df1, df2, by = "SubRegion", suffix = suff, keep = TRUE)
}

# Define parameter levels
param_levels <- c(
  "DrtSynthWQ",
  "Temperature",
  "Salinity",
  "Secchi",
  "DrtSynthNutr",
  "DissAmmonia",
  "DissNitrateNitrite",
  "DissOrthophos",
  "DrtSynthChla",
  "Chlorophyll"
)

# Run reducing full join and clean up data frame for plotting
df_subregion <- reduce2(ndf_subregion$df_subreg, lst_suffix, join_subregions) %>% 
  mutate(across(contains("_"), ~ if_else(!is.na(.x), "Yes", "No"))) %>% 
  rename_with(~ str_extract(.x, "(?<=_).+"), contains("_")) %>% 
  pivot_longer(cols = -SubRegion, names_to = "Parameter", values_to = "Included") %>% 
  mutate(
    Parameter = factor(Parameter, levels = param_levels),
    SubRegion = fct_rev(factor(SubRegion))
  )

# Create a function for the SubRegion comparison tile plots
plot_reg_comp <- function(df) {
  df %>% 
    ggplot(aes(x = Parameter, y = SubRegion, fill = Included)) + 
    geom_tile()
}

Tile plots

All Parameters

Water Quality Parameters

Nutrients

Chlorophyll

Maps

# Transform WW_Delta shapefile to the CRS of sf_delta
WW_Delta_26910 <- st_transform(WW_Delta, crs = st_crs(sf_delta))

# Define the bounding box for sf_delta with a 2.5 km buffer
bbox_delta <- st_bbox(st_buffer(sf_delta, 2500))

# Create a function for the SubRegion comparison maps
map_reg_comp <- function(sf_reg, sf_sta) {
  ggplot() +
    geom_sf(
      data = WW_Delta_26910,
      color = "lightskyblue",
      fill = "lightskyblue",
      size = 0.1
    ) +
    geom_sf(
      data = sf_reg, 
      fill = "brown", 
      alpha = 0.2,
      size = 0.5
    ) +
    geom_sf(data = sf_sta) +
    coord_sf(
      xlim = c(bbox_delta$xmin, bbox_delta$xmax),
      ylim = c(bbox_delta$ymin, bbox_delta$ymax)
    ) +
    theme_bw()
}

# Add AllSubregions to param_levels
param_levels2 <- c("AllSubregions", param_levels)

# Create SubRegion comparison maps
ndf_subregion_maps <- ndf_subregion %>% 
  mutate(
    plt_map = map2(sf_subreg, sf_stations, map_reg_comp),
    Parameter = factor(Parameter, levels = param_levels2)
  ) %>% 
  arrange(Parameter)
AllSubregions

DrtSynthWQ

Temperature

Salinity

Secchi

DrtSynthNutr

DissAmmonia

DissNitrateNitrite

DissOrthophos

DrtSynthChla

Chlorophyll